#!/usr/bin/perl

# Last Version:
# https://unicode.org/Public/idna/latest/
# https://www.unicode.org/reports/tr46

# usage:
# idna_test.pl <IdnaTestV2.txt>

use utf8;
use strict;
use FindBin;
use warnings FATAL => 'all';

binmode(STDOUT, "encoding(UTF-8)");

my $unicode = new IDNATest $ARGV[0];
my $result = $unicode->build();
$unicode->save("$FindBin::RealBin/../../../test/lexbor/unicode/unicode_idna_test_res.h");


package IDNATest;

sub new {
	my ($class, $filepath) = @_;
	my ($data, $p, $size, $i, $begin, $nline, $dec_types, $dec_types_raw, @res);
	my ($parts);

	my $self = {
        filepath   => $filepath,
        data       => [],
		result     => [],
		codepoints => [],
		_prefix    => "lxb_unicode_idna_test"
    };

	bless $self, $class;

	open my $fh, "<:raw", $filepath || die "Failed to open file: $filepath";
	binmode $fh;

	$size = -s $filepath;

	read $fh, $p, $size;
	@res = split '', $p;

	$p = utf_8_to_codepoint(\@res);
	$i = 0;

	while ($i < @$p) {
		$begin = $i;

		$nline = get_from_line($p, "\n", $begin, scalar @$p);
		$i = get_from_line($p, "#", $begin, $nline);

		unless ($begin < $i) {
			$i = $nline + 1;
			next;
		}

		$parts = split_line($p, ";", $begin, $i);
		parts_leading_trailing($parts);

		my $source = to_c_string($parts->[0]);
		my $toUnicode = to_c_string($parts->[1]);
		my $toUnicodeStatus = to_c_string($parts->[2]);
		my $toAsciiN = to_c_string($parts->[3]);
		my $toAsciiNStatus = to_c_string($parts->[4]);
		my $toAsciiT = to_c_string($parts->[5]);
		my $toAsciiTStatus = to_c_string($parts->[6]);

		my $ref_toUnicode = (!defined $toUnicode) ? $source : $toUnicode;
		my $ref_toAsciiN = (!defined $toAsciiN) ? $ref_toUnicode : $toAsciiN;

		my $status = (defined $toUnicodeStatus) ? $toUnicodeStatus : $toAsciiNStatus;
		$status = "" unless defined $status;

		push @$data, [$source, $ref_toAsciiN, $status];

		$i = $nline + 1;
	}

	close($fh);

	$self->{data} = $data;

    return $self;
}

sub build {
	my $self = shift;
	my ($entry, $result, $source, $ascii, $src_len, $ascii_len, $status, $count);
	my $data = $self->{data};

	$count = 0;
	$result = [];

	foreach my $idx (0..@$data - 1) {
		$entry = $data->[$idx];
		$source = $entry->[0];
		$ascii = $entry->[1];
		$status = length $entry->[2];

		push @$result, "    {.source = (const lxb_char_t *) \"$source\", .ascii = (const lxb_char_t *) \"" . $ascii . "\""
		                    .", .status = $status} /* $idx */";
	}

	push @$result, "    {.source = NULL, .ascii = NULL, .status = 0}";

	my $name = $self->lxb_prefix("entries");
	my $entries = "static const lxb_unicode_idna_test_t $name\[\] = {\n" . join(",\n", @$result) . "\n};";

	$self->{result} = $entries;

	return $entries;
}

sub make_entry {
	my ($self, $list, @name) = @_;
	my $count = scalar @$list;

	my $st;
	my $name = $self->lxb_prefix(@name);

	$st = "static const lxb_codepoint_t $name\[$count\] = {";

	if (scalar @$list > 0) {
		$st .= "0x". join(", 0x", @$list) ."};";
	}
	else {
		$st .= "};";
	}

	push @{$self->{codepoints}}, $st;

	return $name;
}

sub save {
	my ($self, $filepath) = @_;
	my $year = 1900 + (localtime)[5];
	my $cps = join("\n", @{$self->{codepoints}});
	my $res = $self->{result};
	my $temp = <<EOM;
/*
 * Copyright (C) $year Alexander Borisov
 *
 * Author: Alexander Borisov <borisov\@lexbor.com>
 */

/*
 * Caution!
 * This file generated by script "utils/lexbor/unicode/idna_test.pl"!
 * Do not change this file!
 */

#ifndef LEXBOR_UNICODE_IDNA_TEST_RES_H
#define LEXBOR_UNICODE_IDNA_TEST_RES_H

#ifdef __cplusplus
extern "C" {
#endif

$cps

$res


#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* LEXBOR_UNICODE_IDNA_TEST_RES_H */
EOM

	open(my $fh, '>', $filepath) || die "Failed to save to file: $filepath";
	binmode $fh;

	print $fh $temp, "\n";

	close($fh);
}

sub lxb_prefix {
	my $self = shift;

	return join "_", $self->{_prefix}, @_;
}

sub codepoint_to_utf_8 {
	my ($cp) = @_;
	my @utf_8;

	$cp = hex($cp);

    if ($cp <0x80) {
        # 0xxxxxxx
		push @utf_8, sprintf("\\x%02X", $cp);

        return join("", @utf_8);
    }

    if ($cp < 0x800) {
        # 110xxxxx 10xxxxxx
		push @utf_8, sprintf("\\x%02X", 0xC0 | ($cp >> 6  ));
		push @utf_8, sprintf("\\x%02X", 0x80 | ($cp & 0x3F));

        return join("", @utf_8);
    }

    if ($cp < 0x10000) {
        # 1110xxxx 10xxxxxx 10xxxxxx
        push @utf_8, sprintf("\\x%02X", 0xE0 | (($cp >> 12)));
        push @utf_8, sprintf("\\x%02X", 0x80 | (($cp >> 6 ) & 0x3F));
        push @utf_8, sprintf("\\x%02X", 0x80 | ( $cp        & 0x3F));

        return join("", @utf_8);
    }

    if ($cp < 0x110000) {
        # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        push @utf_8, sprintf("\\x%02X", 0xF0 | ( $cp >> 18));
        push @utf_8, sprintf("\\x%02X", 0x80 | (($cp >> 12) & 0x3F));
        push @utf_8, sprintf("\\x%02X", 0x80 | (($cp >> 6 ) & 0x3F));
        push @utf_8, sprintf("\\x%02X", 0x80 | ( $cp        & 0x3F));

        return join("", @utf_8);
    }

    die "Failed to convert codepoint to UTF-8: $cp";
}

sub utf_8_to_codepoint {
	my $data = shift;
	my ($cp, $en, $i, @res);

	$i = 0;

	while (exists $data->[$i]) {
		if (ord($data->[$i]) < 0x80){
			# 0xxxxxxx

			$cp = ord($data->[$i]);

			# \uXXXX or \x{XXXX}
			if ($cp == ord("\\") && exists $data->[$i + 5]) {
				$en = ord($data->[$i + 1]);

				if ($en == ord("u")) {
					$en = join "", @{$data}[$i + 2..$i + 5];

					if ($en =~ /^[a-fA-F0-9]{4}$/) {
						$cp = hex($en);
						$i += 5;
					}
				}
				elsif ($en == ord("x") && exists $data->[$i + 7]
					   && ord($data->[$i + 2]) == ord("{")
					   && ord($data->[$i + 7]) == ord("}"))
				{
					$en = join "", @{$data}[$i + 3..$i + 6];

					if ($en =~ /^[a-fA-F0-9]{4}$/) {
						$cp = hex($en);
						$i += 7;
					}
				}
			}

			$i += 1;
		}
		elsif ((ord($data->[$i]) & 0xe0) == 0xc0) {
			# 110xxxxx 10xxxxxx

			die "bad UTF-8 sequences" unless exists $data->[$i + 1];

			$cp  = (ord($data->[$i])     ^ (0xC0 & ord($data->[$i]))) << 6;
			$cp |= (ord($data->[$i + 1]) ^ (0x80 & ord($data->[$i + 1])));

			$i += 2;
		}
		elsif ((ord($data->[$i]) & 0xf0) == 0xe0) {
			# 1110xxxx 10xxxxxx 10xxxxxx

			die "bad UTF-8 sequences" unless exists $data->[$i + 2];

			$cp  = (ord($data->[$i])     ^ (0xE0 & ord($data->[$i])))     << 12;
			$cp |= (ord($data->[$i + 1]) ^ (0x80 & ord($data->[$i + 1]))) << 6;
			$cp |= (ord($data->[$i + 2]) ^ (0x80 & ord($data->[$i + 2])));

			$i += 3;
		}
		elsif ((ord($data->[$i]) & 0xf8) == 0xf0) {
			# 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

			die "bad UTF-8 sequences" unless exists $data->[$i + 3];

			$cp  = (ord($data->[$i])     ^ (0xF0 & ord($data->[$i])))     << 18;
			$cp |= (ord($data->[$i + 1]) ^ (0x80 & ord($data->[$i + 1]))) << 12;
			$cp |= (ord($data->[$i + 2]) ^ (0x80 & ord($data->[$i + 2]))) << 6;
			$cp |= (ord($data->[$i + 3]) ^ (0x80 & ord($data->[$i + 3])));

			$i += 4;
		}
		else {
			die "unsupport UTF-8 sequences";
		}

		push @res, sprintf("%04X", $cp);
	}

    return \@res;
}

sub get_from_line {
	my ($data, $chr, $begin, $end) = @_;

	while ($begin < $end) {
		last if hex($data->[$begin]) == ord($chr);
		$begin += 1;
	}

	return $begin;
}

sub split_line {
	my ($data, $chr, $begin, $end) = @_;
	my (@result, $part);

	$part = [];

	while ($begin < $end) {
		if (hex($data->[$begin]) == ord($chr)) {
			push @result, $part;

			$part = [];
			$begin += 1;

			next;
		}

		push @$part, $data->[$begin];
		$begin += 1;
	}

	if (@$part != 0) {
		push @result, $part;
	}

	return \@result;
}

sub parts_leading_trailing {
	my ($parts) = @_;
	my ($cp, $begin, $end);

	foreach my $entry (@$parts) {
		$begin = 0;

		while ($begin < @$entry) {
			$cp = hex($entry->[$begin]);

			last if $cp != ord(" ") && $cp != ord("\t");

			$begin += 1;
		}

		$end = @$entry;

		while ($end > 0) {
			$end -= 1;
			$cp = hex($entry->[$end]);

			last if $cp != ord(" ") && $cp != ord("\t");
		}

		@$entry = @{$entry}[$begin..$end];
	}
}

sub to_c_string {
	my ($part) = @_;
	my ($cp, @res);

	return undef unless @$part;

	if (@$part == 2) {
		if (hex($part->[0]) == ord('"') && hex($part->[1]) == ord('"')) {
			return "";
		}
	}

	foreach my $cp (@$part) {
		push @res, codepoint_to_utf_8($cp);
	}

	return join "", @res;
}
